* Session set-up
* --------------
* Run everything below under Stata 14 syntax rules.
version 14
* Close a log left open by an earlier run; capture suppresses the error when none is open.
capture log close
* Empty the data in memory and switch off output paging.
clear
set more off

* Folder globals used by the rest of the script:
*   j = working folder (cr-7 is read from it, intermediate files are saved to it)
*   d = source data folder (hdr.dta and reg are read from it)
global j "D:\Home\aejack\Research\NewTechnologies\TillReceiptScanning\junk2"
global d "D:\Home\aejack\Data\SpendingStudy\DataDocumentation\v2"



* Participation in Spending Study
************************************

*RQ3: Pattern of participation over time
*****************************************



* Receipt dates: build a lookup file with one receipt date per Doc_ID
*********************************************************************

use $d/hdr.dta, clear

* Convert the text variable Date to a Stata daily date. With the DM20Y mask
* the text is read as day, month, two-digit year, and the year is placed in 20xx.
generate date = date(Date, "DM20Y")

* show the conversion twice: first as the raw number, then formatted as a date
list Date date
format %td date
list Date date
label variable date "date on receipt"

* keep only the key and the date; the key is renamed to image1Page, the
* variable used as merge key in the analysis file
rename Doc_ID image1Page
keep image1Page date
sort image1Page
save $j/receiptdate, replace


* Analysis file and sample restrictions
***************************************

use $j/cr-7, clear
* declare the survey design: PSU = i_psu, strata = i_strata
svyset i_psu, strata(i_strata)

* exclude cases found ineligible for the scanning study (eligible coded 0),
* then show what is left
keep if eligible != 0
tabulate eligible

* exclude IP9 non-respondents: only ip9 coded 1 is kept;
* the variable is shown with and without value labels first
tabulate ip9
tabulate ip9, nolabel
drop if ip9 != 1


* exclude non-participants in the Spending Study: only everapp_dv coded 1 is kept
drop if everapp_dv != 1
count

* keep incomplete entries (EntryType)?
*tab EntryType

* Inspect repeated values of image1Page, the key for the receipt-date merge.
* The tag variable holds, for each row, the number of other rows sharing its value.
sort image1Page
duplicates report image1Page
tempvar ncopies
duplicates tag image1Page, generate(`ncopies')
list image* if `ncopies' == 1
// tag value 6003 = cases where image1Page is empty (not a receipt scan)
list image*
// for the two cases tagged 1, copy image2Page into image1Page
// (original note, using the names image2Photo / image1Photo: the two are
// equal for all other cases)
replace image1Page = image2Page if `ncopies' == 1
drop `ncopies'

* tag again after the correction and list values that still occur more than twice
tempvar ncopies
duplicates tag image1Page, generate(`ncopies')
list image1Page if `ncopies' > 1
drop `ncopies'


* attach the receipt date to each entry; rows that exist only in the
* receipt-date file (no entry in the analysis file) are discarded
merge m:1 image1Page using $j/receiptdate
keep if _merge != 2
drop _merge


* Remove entries whose receipt is dated before 21/Oct/2016
* (email invitations to scanning study sent).
* Entries without a receipt date stay in: a missing value compares as larger
* than any date, so the condition is false for them.
drop if date < mdy(10, 21, 2016)
// 34 observations dropped where the date of the receipt is before 21st Oct

* put each person's entries in order of start time
sort pidp starttime
*list day_dv starttime , sepby(pidp)

* flag one entry per person (the first in start-time order)
by pidp: generate tag2 = (_n == 1)

* day of scanning, counted from the day of the person's first entry
* (1 = first day; later steps restrict to days 1-35)
by pidp: generate scanday_dv = day_dv - day_dv[1] + 1
label variable scanday_dv "day, since first day of app use"
*list day_dv starttime scanday_dv, sepby(pidp)

* first day of scanning - which calendar days?
tabulate starttime if scanday_dv == 1

* Registration survey: add the reported frequency of purchases per person
sort pidp
merge m:1 pidp using $d/reg, keepusing(pidp purchase_freq)
* rows found only in the registration file are not needed
keep if _merge != 2
tabulate purchase_freq if _merge == 1
tabulate _merge if tag2 == 1 // purchase_freq missing for 4 persons
drop _merge

* three-group version: codes 1 and 2 are kept, codes 3 to 6 are pooled
recode purchase_freq (1=1 "several times a day") (2=2 "about once a day") (3 4 5 6=3 "less than once a day"), generate(pfreq)
tabulate purchase_freq pfreq, missing
label variable pfreq "purchase frequency"
* distribution over persons (one tagged row per person)
tabulate pfreq if tag2

* activity type (codes below 3 only) by scan day, days 1-35:
* first for everyone, then separately for each purchase-frequency group
tabulate scanday_dv activitytype if activitytype < 3 & scanday_dv < 36
forvalues grp = 1/3 {
	tabulate scanday_dv activitytype if activitytype < 3 & pfreq == `grp' & scanday_dv < 36
}



* number of respondents using the app per day
*--------------------------------------------------
* persdaytag flags the first entry (in start-time order) of each person on
* each scan day; it is left missing where the scan day is missing
bysort pidp scanday_dv (starttime): gen persdaytag = _n==1 if scanday_dv<.
lab var persdaytag "tags one obs per person and day"
sort pidp scanday_dv starttime
*list starttime scanday persdaytag, sepby(pidp)
tab scanday_dv if persdaytag==1 & scanday_dv<36

* normalise so that day 1 = 100%
* Nperday receives the number of persons per day (one row per day that
* occurs in the data); Dayvalues receives the day belonging to each row.
tab scanday_dv if persdaytag==1 & scanday_dv<36, matcell(Nperday) matrow(Dayvalues)
matrix list Nperday

* Everything below addresses rows by day number, so stop with a clear
* message unless the table has exactly 35 rows running from day 1 to day 35.
local daysok = (rowsof(Nperday) == 35)
if `daysok' local daysok = (Dayvalues[1,1] == 1 & Dayvalues[35,1] == 35)
if !`daysok' {
	display as error "per-day table does not have exactly one row for each of days 1 to 35"
	exit 459
}

* add two empty columns: the day number and the percentage
matrix Empty = J(35,2,.)

matrix Nperday = Nperday,Empty
matrix colnames Nperday = N day percentN
* The denominator is the number of persons using the app on day 1, read from
* row 1 of the table, so that day 1 is 100% by construction. It was
* previously typed in as 270.
forvalues n=1/35 {
	matrix Nperday[`n',2]=`n'
	matrix Nperday[`n',3]=Nperday[`n',1]/Nperday[1,1]*100
}
matrix list Nperday
* add "survivor function" and create combined graph below



* mean number of app uses per day/respondent
*-------------------------------------------------
* Build a frame file with 35 rows (scan days 1 to 35) for every participant.
* preserve / restore leaves the analysis data in memory unchanged.
preserve
keep if everapp_dv == 1
keep pidp
* one row per person
bysort pidp: keep if _n == 1
* 35 copies of each person, numbered 1 to 35
expand 35
bysort pidp: generate scanday_dv = _n
sort pidp scanday_dv
save $j/pidpdays, replace
restore

* per person and scan day: number of app uses (= number of rows);
* left missing where the scan day is missing
bysort pidp scanday_dv (starttime): generate nuses_dv = _N if !missing(scanday_dv)

* per person and scan day: number of scans and purchases entered,
* i.e. rows with activitytype 1 or 2
generate purch = inlist(activitytype, 1, 2)
tabulate activitytype purch, missing
bysort pidp scanday_dv (starttime): egen npurchases_dv = total(purch)
list scanday_dv activitytype nuses_dv purch npurchases_dv persdaytag, sepby(pidp)

* reduce the data to one row per person and day
keep if persdaytag == 1

* keep the per-day counts and the person-level variables needed later
keep pidp scanday_dv nuses_dv npurchases_dv pfreq i_psu i_strata
sort pidp scanday_dv
duplicates report pidp scanday_dv

* add the 35-day frame so that every participant has a row for every day
merge 1:1 pidp scanday_dv using $j/pidpdays
tab scanday_dv if _merge==1 // rows not in the frame: scan days beyond 35
drop if _merge==1
list if _merge==2, sepby(pidp)  // days in which app was not used

* rows that come from the frame only are days without app use
replace nuses_dv=0 if _merge==2
replace npurchases_dv=0 if _merge==2

* The person-level variables are missing on the rows added from the frame.
* Copy them from the person's other rows: within person the rows are sorted
* on the variable being filled, so a non-missing value (missing sorts last)
* is in first position whichever day it comes from. Persons with no
* non-missing value keep missing.
bysort pidp (pfreq): replace pfreq = pfreq[1] if pfreq==.
bysort pidp (i_psu): replace i_psu = i_psu[1] if i_psu==.
bysort pidp (i_strata): replace i_strata = i_strata[1] if i_strata==.
inspect i_psu i_strata
sort pidp scanday_dv
list scanday_dv-npurchases_dv, sepby(pidp)

drop _merge
* scans and purchases are a subset of all app uses
assert npurchases_dv<=nuses_dv
tab scanday_dv

* for each day, create indicator of whether person used the app again on a future day
* running total of app uses over the person's days
bysort pidp (scanday_dv): gen sumnuses_dv = sum(nuses_dv)
sort pidp scanday_dv
list scanday_dv nuses_dv sumnuses_dv, sepby(pidp)

* survive = 1 while the running total is still below the person's total on
* the last day (day 35), i.e. there is at least one more app use to come.
* The within-person order is stated explicitly so that the last row is the
* last day whatever the sort order left by earlier commands.
bysort pidp (scanday_dv): gen survive = sumnuses_dv<sumnuses_dv[_N]
list scanday_dv nuses_dv sumnuses_dv survive, sepby(pidp)
* Survive: one row per day, column 1 = survive 0, column 2 = survive 1
tab scanday_dv survive, matcell(Survive)

matrix list Survive
matrix Empty = J(35,1,.)
matrix Survive = Survive,Empty
matrix colnames Survive = Nout Nsurvive percentSurv 
* Denominator: number of persons who used the app on day 1, counted from the
* data (previously typed in as 270). Rows added from the frame have
* nuses_dv = 0, so they are not counted.
count if scanday_dv==1 & nuses_dv>0 & nuses_dv<.
local Nday1 = r(N)
forvalues n=1/35 {
	matrix Survive[`n',3]=Survive[`n',2]/`Nday1'*100
}
matrix list Survive



* Mean number of app uses and of purchases reported per day, each stored as a
* one-column matrix (one row per scan day) to be added to the matrix for graphs

* all app uses
mean nuses_dv, over(scanday_dv)
matrix Nuses = e(b)'
matrix colnames Nuses = Nuses
matrix list Nuses

* scans and purchases
mean npurchases_dv, over(scanday_dv)
matrix Npurchases = e(b)'
matrix colnames Npurchases = Npurchases
matrix list Npurchases


* mean number of purchases reported per day, separately for each group of
* shopping frequency from the registration survey (pfreq 1, 2, 3)
* confidence intervals reported in text for Figure 2 are from here:
forvalues grp = 1/3 {
	mean npurchases_dv if pfreq == `grp' & scanday_dv < 36, over(scanday_dv)
	matrix Npurchases`grp' = e(b)'
	matrix colnames Npurchases`grp' = Npurchases`grp'

}

* Do the 3 shopper frequency groups differ in the mean number of
* scans/purchases per day? Days are pooled.

* same design as before, with centering for strata that contain a single PSU
svyset i_psu, strata(i_strata) singleunit(centered)

svy: mean npurchases_dv, over(pfreq)

* pairwise tests of equal means, in the order 1 vs 2, 1 vs 3, 2 vs 3
forvalues a = 1/2 {
	forvalues b = `=`a'+1'/3 {
		test [npurchases_dv]_subpop_`a' = [npurchases_dv]_subpop_`b'
	}
}

 
* Put all per-day series side by side in one matrix, starting from Nperday
* and appending the others column-wise in this order.
matrix All = Nperday
foreach part in Survive Nuses Npurchases Npurchases1 Npurchases2 Npurchases3 {
	matrix All = All, `part'
}
matrix list All

* copy the matrix into the dataset; variables are named after the matrix columns
svmat All, names(col)
label variable percentN "Participants used app"
label variable percentSurv "Participants continued in study"
label variable Nuses "Total app uses"
label variable Npurchases "Scans and purchases"
label variable Npurchases1 "Spends several times a day"
label variable Npurchases2 "Spends about once a day"
label variable Npurchases3 "Spends less than once a day"


* Figure 1: percent using the app and percent continuing in the study, days 1-31
twoway (line percentN day if day<32) (line percentSurv day if day<32), ///
	ylabel(10(10)100) xtick(1(1)31) xlabel(1(2)31) ytitle("Percent") xtitle("Day") scheme(s2mono)
* numbers corresponding to graphs for text:
list day percentN percentSurv in 1/35



* not in paper: mean scans/purchases and mean app uses per day, days 1-31
twoway (line Npurchases day if day<32) (line Nuses day if day<32), ///
	ytitle("Mean") xtitle("Day") scheme(s2mono) xtick(1(1)31) xlabel(1(2)31) ytick(0(.1)2)

* numbers corresponding to graphs for text:
list day Npurchases Nuses in 1/35

* Figure 2: mean scans/purchases per day by purchase-frequency group, days 1-31
twoway (line Npurchases1 day if day<32) (line Npurchases2 day if day<32) (line Npurchases3 day if day<32), ///
	ytitle("Mean")	xtitle("Day") scheme(s2mono) xtick(1(1)31) xlabel(1(2)31) ytick(0(.1)3) ylabel(0.5(0.5)3)

* confidence interval for npurchases among those who spend several times a
* day (pfreq==1), all days pooled.
* The group is passed through subpop() rather than if: with the svy prefix,
* if removes the other groups before the design is evaluated, whereas
* subpop() keeps the full design for the variance estimate.
* NOTE(review): this comment previously read "by frequency of purchasing and
* day", but the command covers pfreq==1 only and is not split by day.
* Confirm whether over(scanday_dv) and the other two groups are wanted.
svy, subpop(if pfreq==1): mean npurchases_dv
	
	

* mean, median, max number of app uses per respondent and day
**************************************************************
summarize nuses_dv, detail
* mean per scan day: all app uses, then scans and purchases
table scanday_dv, contents(mean nuses_dv)
table scanday_dv, contents(mean npurchases_dv)


* frequency of shopping reported in the registration survey
* (restricted to the day-1 rows)
tabulate pfreq if scanday_dv == 1

exit

 
 